Presentation

December 9, 2024

Shirley Toribio

Diversity of Horror

Netflix Horror Movies

horror_df
  num         genre
1 125        Horror
2 110     Thrillers
3  38      Comedies
4  19 SciFi&Fantasy
5  12          Cult
6   2 Documentaries
7   2      Romantic

Netflix Horror Movies

Beta vs Normal Distribution

Function 1: Prediction Interval (PI) Calculation

PI <- function(data, coverage_prob) {
  # Builds a two-sided normal-theory prediction interval for one future
  # observation, based on a numeric sample.
  #
  # Arguments:
  #   data          numeric vector of observations (at least two values, so
  #                 that the sample standard deviation and the t quantile with
  #                 n - 1 degrees of freedom are defined).
  #   coverage_prob intended coverage probability; a single number strictly
  #                 between 0 and 1 (e.g. 0.95).
  #
  # Returns:
  #   A one-row data.frame with columns PI_percentage (the requested
  #   coverage), lower and upper (the interval bounds).
  #
  # Invalid inputs raise an error instead of silently yielding NaN bounds
  # (qt() returns NaN for probabilities outside [0, 1]).
  stopifnot(
    "`data` must be a numeric vector with at least two values" =
      is.numeric(data) && length(data) >= 2,
    "`coverage_prob` must be a single number strictly between 0 and 1" =
      is.numeric(coverage_prob) && length(coverage_prob) == 1 &&
        !is.na(coverage_prob) && coverage_prob > 0 && coverage_prob < 1
  )

  n <- length(data)
  avg <- mean(data)
  stan_d <- sd(data)

  # Probability left in each tail; the t distribution is symmetric, so a
  # single critical value gives both bounds.
  tail_prob <- (1 - coverage_prob) / 2
  t_crit <- qt(1 - tail_prob, df = n - 1)

  # sqrt(1 + 1/n) accounts for both the sampling variability of the mean and
  # the variability of the new observation being predicted.
  margin <- t_crit * stan_d * sqrt(1 + (1 / n))

  data.frame(
    PI_percentage = coverage_prob,
    lower = avg - margin,
    upper = avg + margin
  )
}

Function 2: One simulation of beta-generated data

one_beta_simulation <- function(n, alpha, beta, pi_prop) {
  # Runs one simulation: draws n values from a Beta(alpha, beta) distribution,
  # builds a normal prediction interval from them with PI(), and measures how
  # that interval performs against the true parent distribution.
  #
  # Arguments:
  #   n        sample size of the simulated data.
  #   alpha    first shape parameter of the Beta distribution.
  #   beta     second shape parameter of the Beta distribution.
  #   pi_prop  intended coverage probability handed to PI().
  #
  # Returns:
  #   A one-row data.frame with the PI() columns (PI_percentage, lower, upper)
  #   followed by cover, alpha, beta and mean_in_interval.

  cover_df <- PI(rbeta(n, alpha, beta), pi_prop)
  lower <- cover_df[1, "lower"]
  upper <- cover_df[1, "upper"]

  # Proportion of the parent Beta distribution that actually falls inside the
  # normal prediction interval generated for the simulated data.
  cover_prop <- pbeta(upper, alpha, beta) - pbeta(lower, alpha, beta)

  # The mean of Beta(alpha, beta) is alpha / (alpha + beta). It equals 0.5
  # only in the symmetric case alpha == beta, so it is computed rather than
  # hard-coded.
  true_mean <- alpha / (alpha + beta)
  mean_in_interval <- true_mean >= lower & true_mean <= upper

  param_df <- data.frame(
    cover = cover_prop,
    alpha = alpha,
    beta = beta,
    mean_in_interval = mean_in_interval
  )
  cbind(cover_df, param_df)
}

Function 3: Multiple Beta simulations

beta_sims_n <- function(n, params = parameters, pi_prop = 0.95) {
  # Applies one_beta_simulation() to every value in a vector of shape
  # parameters, using alpha = beta = that value, with all simulations sharing
  # sample size n.
  #
  # Arguments:
  #   n        sample size used for every simulation.
  #   params   vector of values used as alpha = beta. Defaults to the global
  #            `parameters`, which is what this function originally read.
  #   pi_prop  intended coverage probability of the prediction interval.
  #            The original code passed `pi` here, which resolves to R's
  #            built-in constant 3.14159... unless a global named `pi` is
  #            defined. That is not a valid probability and makes qt()
  #            return NaN for both interval bounds.
  #            NOTE(review): 0.95 is an assumed default; confirm the intended
  #            coverage level.
  #
  # Returns:
  #   A data.frame with one row per element of params: a leading column n,
  #   followed by the columns returned by one_beta_simulation().
  df1 <- map(
    params,
    \(param) one_beta_simulation(n, param, param, pi_prop)
  ) %>%
    list_rbind()
  df2 <- data.frame(n = rep(n, nrow(df1)))
  cbind(df2, df1)
}

Simulations

     n PI_percentage lower upper cover alpha beta mean_in_interval
1  164      3.141593   NaN   NaN   NaN   123  123               NA
2   67      3.141593   NaN   NaN   NaN    17   17               NA
3   78      3.141593   NaN   NaN   NaN   113  113               NA
4  409      3.141593   NaN   NaN   NaN   195  195               NA
5  337      3.141593   NaN   NaN   NaN    67   67               NA
6   31      3.141593   NaN   NaN   NaN   151  151               NA
7  332      3.141593   NaN   NaN   NaN    65   65               NA
8  200      3.141593   NaN   NaN   NaN   187  187               NA
9   88      3.141593   NaN   NaN   NaN    91   91               NA
10 456      3.141593   NaN   NaN   NaN    55   55               NA

Results

FIN